How I created the new Annotation Families Integrated (AFI) table

I downloaded from the Biomercator output the QTLs and meta-QTLs intervals in base pairs
I used the command bedextraxt from bedops to extract the genes in the QTLs and meta-QTLs intervals
The .bed reference file of the grapevine genome used to extract the genes is the V1 annotation mapped on the 12X.2 structural annotation downloaded from https://urgi.versailles.inra.fr/Species/Vitis/Genome-Browser . I downloaded each chromosome gff file and pasted into unique file and modified to format it like .bed file
Then I executed the following commands to sort bed files and extract genes in the intervals

for d in ./*/ ; do (cd "$d" && sed -i 's/^\<[0-9]*\>/chr&/g'  *_metaqtls ); done

for d in ./*/ ; do (cd "$d" && for f in *_metaqtls; do ~/PROGRAMS/bedops/bin/sort-bed $f > $f.sorted; done ); done

for d in ./*/ ; do (cd "$d" && split -l 1 *.sorted --additional-suffix=.ok ); done

for d in ./*/ ; do (cd "$d" && mkdir ready_for_bedextract ); done

for d in ./*/ ; do (cd "$d" && mv *.ok ready_for_bedextract ); done

for d in ./*/ ; do (cd "$d/ready_for_bedextract/" && for f in *.ok; do  mv $f `cut -f 4 $f` ; done ); done

for d in ./*/ ; do (cd "$d/ready_for_bedextract/" && for f in *; do  ~/PROGRAMS/bedops/bin/bedextract ../../../copy/V2.1_ok.bed $f > $f.bed ; done ); done

Then I loaded in R all the list of genes comprised in the meta-QTLs intervals

options(knitr.kable.NA = '')
setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/mqtl_and_qtl_genes_bedops/metaqtl/")
filenames <- list.files(path = ".", pattern = ".bed")
kable(filenames, align="l")
flo_LG1_vey_1.bed
flo_LG1_vey_2.bed
flo_LG14_vey_1.bed
flo_LG14_vey_2.bed
flo_LG14_vey_3.bed
flo_LG2_vey_1.bed
flo_LG7_vey_1.bed
pheno_LG1_vey_1.bed
pheno_LG1_vey_2.bed
pheno_LG14_vey_2.bed
pheno_LG16_vey_1.bed
pheno_LG17_vey_1.bed
pheno_LG18_ripe+F-V_1.bed
pheno_LG2_vey_1.bed
pheno_LG2_vey_2.bed
pheno_LG2_vey_3.bed
pheno_LG2_vey_ver+Vr-Rp_3.bed
pheno_LG6_vey_1.bed
ripe_LG13_vey_bis_1.bed
ripe_LG13_vey_bis_2.bed
ripe_LG18_vey_1.bed
ripe_LG2_vey_1.bed
ripe_LG2_vey_2.bed
ripe_LG2_vey_3.bed
ripe_LG3_vey_1.bed
ver_LG1_vey_1.bed
ver_LG2_vey_1.bed
ver_LG2_vey_2.bed
ver_LG2_vey_3.bed
all_metaqtl <-  lapply(filenames, read.table, header = F, sep="\t", col.names = c("metaQTL","chr","start","end","gene")) ## load all the file in one command
names(all_metaqtl) <- filenames
names(all_metaqtl) <- gsub(pattern=".bed", replacement="", x=names(all_metaqtl)) ## remove .bed suffix
all_mqtl_merge <- Reduce(function(...) merge(...,by=c("gene","chr","start","end"),  all=T), all_metaqtl) ## merge the list of file in one big dataframe
colnames(all_mqtl_merge) <- c("gene","chr","start","end", names(all_metaqtl))
kable(all_mqtl_merge[1:6,1:18], align="l")
gene chr start end flo_LG1_vey_1 flo_LG1_vey_2 flo_LG14_vey_1 flo_LG14_vey_2 flo_LG14_vey_3 flo_LG2_vey_1 flo_LG7_vey_1 pheno_LG1_vey_1 pheno_LG1_vey_2 pheno_LG14_vey_2 pheno_LG16_vey_1 pheno_LG17_vey_1 pheno_LG18_ripe+F-V_1 pheno_LG2_vey_1
VIT_01s0011g00620 chr1 563253 566683 flo_LG1_vey_1
VIT_01s0011g00630 chr1 572047 573571 flo_LG1_vey_1
VIT_01s0011g00640 chr1 573650 579327 flo_LG1_vey_1
VIT_01s0011g00650 chr1 586438 589926 flo_LG1_vey_1
VIT_01s0011g00660 chr1 590563 592859 flo_LG1_vey_1
VIT_01s0011g00670 chr1 593582 596330 flo_LG1_vey_1

The QTL genes loaded were only the ones from QTLs related to phenology, identified from the pattern option in list.files command

options(knitr.kable.NA = '')
setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/mqtl_and_qtl_genes_bedops/qtl/V1_on_12x.2")
filenames <- list.files(path = ".", pattern = ("VT|F-V|VB|VE|Vr|VP|V-R|Vr-Rp|F-R"))
kable(length(filenames), align="l")
56
all_qtl <-  lapply(filenames, read.table, header = F, sep="\t", col.names = c("QTL","chr","start","end","gene"))
names(all_qtl) <- filenames
names(all_qtl) <- gsub(pattern=".bed", replacement="", x=names(all_qtl)) ## remove .bed suffix
all_qtl_merge <- Reduce(function(...) merge(...,by=c("gene","chr","start","end"),  all=T), all_qtl)
colnames(all_qtl_merge) <- c("gene","chr","start","end", names(all_qtl))
kable(all_qtl_merge[1:6,1:18], align="l")
gene chr start end BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Carreno_2005_VT_3 Carreno_2007_VT_1 Carreno_2008_VT_2 Costantini_2003_F-R_1 Costantini_2003_F-V_1 Costantini_2003_F-V_2 Costantini_2003_F-V_3 Costantini_2003_V-R_1 Costantini_2003_V-R_2 Costantini_2003_VP_1 Costantini_2003_VT_1
VIT_00s0229g00010 chr2 5346706 5349092 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1
VIT_00s0229g00020 chr2 5357779 5376350 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1
VIT_00s0229g00030 chr2 5376351 5386049 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1
VIT_00s0229g00040 chr2 5389560 5389838 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1
VIT_00s0229g00050 chr2 5391527 5392605 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1
VIT_00s0229g00060 chr2 5395355 5398517 BayoCanha_2008_Vr_1 BayoCanha_2009_Vr_2 BayoCanha_2010_Vr-Rp_1 Costantini_2003_V-R_1

Now I load the original AFI file as received from Sara

options(knitr.kable.NA = '')
## merge qtls and metaqtls (this command I cannot do it on the laptop, too memory consuming)
# all_qtls_metaqtls_V1_on_12x.2 <- merge(all_qtl_merge, all_mqtl_merge, all=T)
## I load the file elaborated on the server
# write.table(all_qtls_metaqtls_V1_on_12x.2, file="../all_qtls_metaqtls_V1_on_12x.2.txt", row.names=F, quote=F, sep="\t", dec=".")
all_qtls_metaqtls_V1_on_12x.2 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/all_qtls_metaqtls_V1_on_12x.2.txt", header=T)
afi_original <- read_excel("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/OLD-FILES/nuovi_database/Annotation_Families_Integrated_2016.xlsx", sheet= "Annotation_Families_Integrated")
kable(head(afi_original), align="l")
Nimblegen Probe_ID Unique ID Agilent Probe_ID Corvina_Annotation/Others old 12Xv1 name 12Xv0 ID Identical genes in 8X or other EST Probeset grapegen Chromosome position 12X Cardinality between 12Xv0 and v1 Cardinality between 8X and 12Xv1 Track 12X v1 Functional annotation Reference Network Gene Ontology Function - R
CHR3_JGVV88_67_T01 VIT_03s0088g00510 CUST_9884_PI428914343 JGVv88.67.t01 GSVIVT01037034001 GSVIVP00038603001 chr03_08696026_08697570 OK V1 hypothetical MADS-box type 1 alpha 1b (VviMADS1A1b) Grimplet, J. 2016 vv60042MADS Transcription Factor Activity hypothetical MADS-box type 1 alpha 1b (VviMADS1A1b)_VIT_03s0088g00510
CHR1_JGVV10_103_T01 VIT_01s0010g01500 CUST_26841_PI428914343 JGVv10.103.t01 GSVIVT01010218001 GSVIVP00021252001 chr01_17760880_17762318 OK V1 hypothetical MADS-box type 1 alpha 1d (VviMADS1A1d) Grimplet, J. 2016 vv60042MADS Transcription Factor Activity hypothetical MADS-box type 1 alpha 1d (VviMADS1A1d)_VIT_01s0010g01500
CHR3_JGVV88_59_T01 VIT_03s0088g00610 CUST_9891_PI428914343 JGVv88.59.t01 GSVIVT01037022001 GSVIVP00038593001 chr03_08820035_08820694 OK V1 hypothetical MADS-box type 1 alpha 1f (VviMADS1A1f) Grimplet, J. 2016 vv60042MADS Transcription Factor Activity hypothetical MADS-box type 1 alpha 1f (VviMADS1A1f)_VIT_03s0088g00610
CHR3_JGVV88_61_T01 VIT_03s0088g00590 CUST_9889_PI428914343 JGVv88.61.t01 GSVIVT01037026001 GSVIVP00038595001 chr03_08776002_08776646 OK V1 hypothetical MADS-box type 1 alpha 1g (VviMADS1A1g) Grimplet, J. 2016 vv60042MADS Transcription Factor Activity hypothetical MADS-box type 1 alpha 1g (VviMADS1A1g)_VIT_03s0088g00590
CHR12_JGVV142_18_T01 VIT_12s0142g00360 CUST_22066_PI428914343 JGVv142.18.t01 GSVIVT01000802001 GSVIVP00018932001 GSVIVP00018930001 VVTU5956_at chr12_00279426_00291956 merge2 V1 putative MADS-box Agamous 1 (VviAG1) Grimplet, J. 2016 vv60042MADS Transcription Factor Activity putative MADS-box Agamous 1 (VviAG1)_VIT_12s0142g00360
CHR10_JGVV3_187_T01 VIT_10s0003g02070 CUST_25054_PI428914343 JGVv3.187.t01 GSVIVT01021303001 TC62522 DT011330 chr10_03730392_03738386 OK mrna V1 putative MADS-box Agamous 2 (VviAG2) Grimplet, J. 2016 vv30009Flower_development vv60042MADS Transcription Factor Activity putative MADS-box Agamous 2 (VviAG2)_VIT_10s0003g02070
## remove useless information
afi_original_reduced <- afi_original[,c(2,13)]
afi_original_reduced <- dplyr::filter(afi_original_reduced, grepl("VIT",  `Unique ID`)) ## some filtering
## merge qtls and metaqtls with afi (this commands I cannot do it on the laptop, too memory consuming)
# all_qtls_metaqtls_V1_on_12x.2_with_annotation <- merge(all_qtls_metaqtls_V1_on_12x.2, afi_original_reduced, by.x = "gene", by.y="Unique ID", all=T)
# all_qtls_metaqtls_V1_on_12x.2_with_annotation <- merge(all_qtls_metaqtls_V1_on_12x.2, afi_original_reduced, all=T)
# kable(head(all_qtls_metaqtls_V1_on_12x.2_with_annotation), align="l")

# from here on I performed different merge to attach the new transcriptomic datasets to the afi file, until i get this big file with all the necessary information
new_afi_definitive <- read_excel("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/all_qtls_metaqtls_V1_on_12x.2_with_annotation.xlsx")
dim(new_afi_definitive)
## [1] 34009    97

The table is available at https://pietrod.shinyapps.io/Annotation_Families_Integrated/

Molecular Veraison

I received 6 different datasets of Pinot Noir and Cabernet Sauvignon cultivars RNA-Seq FPKM data for 3 different years of berries samples around veraison time
I load the original dataset to perform filtering and cleaning. We need to identify the molecular veraison for the different years and cultivars in order to align the datasets and perform clustering

options(knitr.kable.NA = '')
pn.12 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/PN12_FPKM_table.txt", header=T, row.names=1)
pn.13 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/PN13_FPKM_table.tsv", header=T, row.names=1)
pn.14 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/PN14_FPKM_table.tab", header=T, row.names=1)
cs.12 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/CS12_FPKM_table.txt", header=T, row.names=1)
cs.13 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/CS13_FPKM_table.tsv", header=T, row.names=1)
cs.14 <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/CS14_FPKM_table.tab", header=T, row.names=1)
dim(pn.12)
## [1] 29971    30
dim(pn.13)
## [1] 29971    33
dim(pn.14)
## [1] 29971    36
dim(cs.12)
## [1] 29971    39
dim(cs.13)
## [1] 29971    42
dim(cs.14)
## [1] 29971    39
kable(pn.12[1:6,1:18],align="l")
PN_0_1_0 PN_0_2_0 PN_0_3_0 PN_A_1_1 PN_A_2_1 PN_A_3_1 PN_A_1_2 PN_A_2_2 PN_A_3_2 PN_A_1_3 PN_A_2_3 PN_A_3_3 PN_A_1_4 PN_A_2_4 PN_A_3_4 PN_A_1_5 PN_A_2_5 PN_A_3_5
VIT_12s0028g03890 6.982480 6.200700 6.599040 12.637400 6.984890 13.07730 8.741250 7.658590 8.0912300 14.24840 15.1014000 13.6873000 11.93210 11.91460 13.7933000 11.4142000 11.89900 13.9172000
VIT_00s0301g00130 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.0000000 0.00000 0.0000000 0.0000000 0.00000 0.00000 0.0000000 0.0000000 0.00000 0.0000000
VIT_15s0021g01700 0.976611 0.480536 0.936513 1.605550 1.082410 1.83607 2.363640 1.279090 1.7194200 1.15728 1.8783100 1.2871100 1.10023 1.53761 1.0152100 1.2928300 1.03396 1.0427500
VIT_03s0132g00350 0.105368 0.147524 0.000000 0.103088 0.128263 0.00000 0.165335 0.133831 0.0519011 0.00000 0.0498966 0.0434941 0.00000 0.00000 0.0443722 0.0685977 0.00000 0.0438471
VIT_17s0000g09240 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.0000000 0.00000 0.0000000 0.0000000 0.00000 0.00000 0.0000000 0.0000000 0.00000 0.0000000
VIT_08s0058g00890 11.724200 12.667300 11.477000 27.740300 15.504900 26.53580 24.262500 24.121700 23.9950000 47.95110 43.5116000 36.4103000 42.22530 41.69190 42.3358000 33.3064000 30.71960 36.8685000

The idea is to calculate the number of genes, in particular the marker transitions early up, that show a value of log2FC greater than 1.5 in the intervals around veraison, in order to identify what time interval shows the highest number of mt genes moving and displaying significant increase in the expression level, to be able to say when is molecular veraison occurring
I wrote an R function that starting from the raw datasets of FPKM perform filtering and cleaning according to Sara suggestions and then calculate the fold change of the mt genes between each pair of time point from T0 to T5 and return the number of genes with a log2FC higher than 1.5
The function works like this:

  1. load a raw dataset and identify the total number of columns (conditions + replicates)
  2. it creates a vector representing the number of columns to add to the dataset that will be filled with NA or keep according to the condition evaluated
  3. the number of column to add is equal to the total number of columns + 1 and the total number of columns + total number of columns divided by 3 (if the number of replicates is 3)
  4. the idea is to evaluate separately every sample and in the corresponding added column for the sample set NA if the FPKM value is less than 1 in at least 2 replicates
  5. the set of new columns is then evaluated by row and if all the columns of a row show NA, the entire row is removed
  6. the mean values of the remaining rows calculated by sample (3 replicates) is then obtained and a new dataframe is created with only the mean values
  7. this dataframe is subsetted based on the early up mt genes
  8. the fold change (FC) is calculated for every gene between a time point and the one before (T1/T0)
  9. log2 of the FC is applied and the number of genes showing FC > 1.5 is returned
molecular_veraison_windows <- function(filename) {
    dat <- read.table(file = paste0("../../../nuovi_dataset/",filename), header=T, row.names=1)
    clmn.n <- length(names(dat))
    col.to.add <- (length(names(dat)) + 1) : ((length(names(dat)) + (length(names(dat))/3)))
    dat[, col.to.add] <- (ifelse(sapply(seq(1,length(names(dat)),by=3),function(i) rowSums(dat[,i:(i+2)] < 1, na.rm=T) > 1),NA,"keep"))
    dat.1 <- dat[rowSums(is.na(dat[,col.to.add]))!=ncol(dat[,col.to.add]), ]
    dat.1 <- dat.1[,1:clmn.n]
    colnames(dat.1) <- paste0(rep("t", length(names(dat.1))), rep(0:((length(names(dat.1))/3)-1),each=3))
    # number of columns per group (1-3, 4-6)
    n <- 3
    # number of groups
    n_grp <- ncol(dat.1) / n
    # column indices (one vector per group)
    idx_grp <- split(seq(dat.1), rep(seq(n_grp), each = n))
    # calculate the row means for all groups
        res <- lapply(idx_grp, function(i) {
        # subset of the data frame
        tmp <- dat.1[i]
        # calculate row means
        rowMeans(tmp, na.rm = TRUE)
    })
    # transform list into a data frame
    dat2 <- as.data.frame(res)
    # extract names of first column of each group
    names_frst <- names(dat.1)[sapply(idx_grp, "[", 1)]
    # modify column names of new data frame
    names(dat2) <- names_frst
    library(readxl)
    m.t.all <- read_excel("../../../geni_marcatori_transizioni_copy.xlsx", sheet="all")
    early_up <- m.t.all[m.t.all$marker_transitions == "early_up",]
    dat2_early_up <- dat2[rownames(dat2) %in% early_up$ID,]

    dat2_early_up <- as.data.frame(dat2_early_up)

    dat2_early_up$FC0_1 <- dat2_early_up$t1 / dat2_early_up$t0
    dat2_early_up$FC1_2 <- dat2_early_up$t2 / dat2_early_up$t1
    dat2_early_up$FC2_3 <- dat2_early_up$t3 / dat2_early_up$t2
    dat2_early_up$FC3_4 <- dat2_early_up$t4 / dat2_early_up$t3
    dat2_early_up$FC4_5 <- dat2_early_up$t5 / dat2_early_up$t4

    dat2_early_up_FC <- log2(dat2_early_up[,n_grp:(n_grp+5)])

    print(dim(dat2_early_up_FC[dat2_early_up_FC$FC0_1 > 1.5 , ]))
    print(dim(dat2_early_up_FC[dat2_early_up_FC$FC1_2 > 1.5 , ]))
    print(dim(dat2_early_up_FC[dat2_early_up_FC$FC2_3 > 1.5 , ]))
    print(dim(dat2_early_up_FC[dat2_early_up_FC$FC3_4 > 1.5 , ]))
    print(dim(dat2_early_up_FC[dat2_early_up_FC$FC4_5 > 1.5 , ]))
}

Example of the function

# load the function
source("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/molecular_veraison_windows.r")
molecular_veraison_windows("PN12_FPKM_table.txt")
## [1] 27  6
## [1] 0 6
## [1] 127   6
## [1] 4 6
## [1] 0 6

In this example (Pinot 2012) the molecular veraison is considered between T2 and T3 - 127 genes (do not consider the second number, it is just the number of columns of the dataset returned as result). The table displayed before the numbers is a snapshot of the original dataset cleaned and where the values of the replicates have been averaged
Results for both cultivars and all years

options(knitr.kable.NA = '')
all.results <- read_excel("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/molecular_veraison-db.xlsx", sheet="all_for_R")
kable(all.results, align="l")
cultivar year t0 genes_number t1 genes_number__1 t2 genes_number__2 t3 genes_number__3 t4 genes_number__4 t5 results_mv
PN 2012 -36 27 -28 0 -18 127 -9 4 0 0 t3
PN 2013 -20 11 -14 52 -7 26 0 0 0 t2
PN 2014 -20 6 -14 38 -6 58 0 0 0 t3
CS 2012 -37 60 -28 20 -18 56 -9 70 0 0 t4
CS 2013 -26 2 -20 8 -13 88 -6 1 0 1 t3
CS 2014 -23 8 -14 18 -7 80 0 8 0 t3

Clusters

To perform clustering I used the software called Clust, available at https://github.com/BaselAbujamous/clust . It is a pyhton program which takes as input the raw datasets and perform filtering, normalization, centering and clustering. It can handle n amount of datasets with different time points or conditions
We ran Clust both on single datasets keeping all the time points, and on multiple datasets together after sincronizing the time points around the molecular veraison identified previously (and then reducing the number of time points)
The clustering process was run not on the entire dataset but only on the genes in the QTLs intervals, that I extracted from the new AFI file

options(knitr.kable.NA = '')
pn.12.qtl.genes <- pn.12[rownames(pn.12) %in% all_qtl_merge$gene, ]
kable(pn.12.qtl.genes[1:6,1:18], align="l")
PN_0_1_0 PN_0_2_0 PN_0_3_0 PN_A_1_1 PN_A_2_1 PN_A_3_1 PN_A_1_2 PN_A_2_2 PN_A_3_2 PN_A_1_3 PN_A_2_3 PN_A_3_3 PN_A_1_4 PN_A_2_4 PN_A_3_4 PN_A_1_5 PN_A_2_5 PN_A_3_5
VIT_03s0132g00350 0.105368 0.147524 0.00000 0.103088 0.128263 0.0000 0.165335 0.133831 0.0519011 0.0000 0.0498966 0.0434941 0.00000 0.0000 0.0443722 0.0685977 0.0000 0.0438471
VIT_03s0088g00890 0.158621 0.000000 0.00000 0.153564 0.000000 0.0000 0.000000 0.000000 0.0000000 0.0000 0.0000000 0.0000000 0.00000 0.0000 0.0000000 0.0000000 0.0000 0.0000000
VIT_02s0025g02400 7.086780 9.338490 6.76182 10.156600 8.812340 10.5243 9.226640 9.140880 9.2230700 11.8870 12.1128000 11.1657000 9.98114 12.1609 10.1839000 10.2910000 10.6120 10.4304000
VIT_03s0110g00560 0.000000 0.000000 0.00000 0.000000 0.000000 0.0000 0.000000 0.000000 0.0000000 0.0000 0.0000000 0.0000000 0.00000 0.0000 0.0000000 0.0000000 0.0000 0.0000000
VIT_02s0154g00030 0.905504 0.000000 0.00000 0.000000 0.000000 0.0000 0.000000 0.000000 0.0000000 0.0000 0.0000000 0.0000000 0.00000 0.0000 0.0000000 0.0000000 0.0000 0.0000000
VIT_02s0025g04120 9.655220 11.126800 7.81751 7.972000 10.320300 10.3643 10.103400 7.677600 7.3933500 12.1458 8.5257800 8.1271900 12.76890 16.8831 13.1693000 9.1092600 11.4073 9.5876300
dim(pn.12.qtl.genes)
## [1] 8091   30

The number of genes in all the QTLs intervals is 8091. The same subsetting has been applied to all 6 datasets
Clust was then ran with the same following commands. Before on single datasets without correction around molecular veraison

clust Data-pn_2012/ -n normalization-pn_2012.txt -r replicates_file-pn_2012.txt -fil-v 1.1 -fil-c 1 -fil-d 1 -np 18

And after correction when running on multiple datasets together, for example on the three years of Pinot together

clust Data-all-pn/ -n normalization-all-pn.txt -r replicates_file-all-pn.txt -fil-v 1.1 -fil-c 1 -fil-d 3 -np 18

Clustering results
The following table summarizes the results obtained for the clustering before and after centering around molecular veraison

options(knitr.kable.NA = '')
clust.results.before <- read_excel("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/clust_results_summary.xlsx", sheet="before")
kable(clust.results.before, align="l")
cultivar year datasets total genes filtered clusters genes in clusters smallest largest not included folder
CS 2012 1 8091 5294 11 1413 11 778 3881 Results_28_Nov_17
CS 2013 1 8091 5253 14 1941 12 754 3312 Results_28_Nov_17
CS 2014 1 8091 5136 10 1907 25 810 3229 Results_28_Nov_17
PN 2012 1 8091 5223 15 2646 11 834 2577 Results_28_Nov_17
PN 2013 1 8091 5184 12 2076 25 763 3108 Results_28_Nov_17
PN 2014 1 8091 5091 13 2047 12 914 3044 Results_28_Nov_17
CS+PN 2012 2 8091 4981 9 806 11 293 4175 Results_29_Nov_17_2012
CS+PN 2013 2 8091 4947 7 893 12 424 4054 Results_29_Nov_17_2013
CS+PN 2014 2 8091 4901 8 727 12 321 4174 Results_29_Nov_17_2014
CS all 3 8091 4976 5 640 15 415 4336 Results_28_Nov_17_Data-all-cs
PN all 3 8091 4924 6 882 17 455 4042 Results_28_Nov_17_Data-all-pn
CS+PN all 6 8091 4714 4 471 11 244 4243 Results_28_Nov_17_Data-all
clust.results.after <- read_excel("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/clust_results_summary.xlsx", sheet="after")
kable(clust.results.after, align="l")
cultivar year datasets total genes filtered clusters genes in clusters smallest largest not included folder
CS 2012 1 8091 5142 16 2110 12 875 3032 cs12
CS 2013 1 8091 5118 14 2147 11 890 2971 cs13
CS 2014 1 8091 5115 12 2127 17 885 2988 cs14
PN 2012 1 8091 5223 15 2646 11 834 2577 pn12
PN 2013 1 8091 5184 12 2076 25 763 3108 pn13
PN 2014 1 8091 5091 13 2047 12 914 3044 pn14
CS+PN 2012 2 8091 4922 6 528 15 322 4394 Results_01_Dec_17_2012
CS+PN 2013 2 8091 4880 8 895 12 425 3985 Results_01_Dec_17_2013
CS+PN 2014 2 8091 4893 6 894 11 363 3999 Results_01_Dec_17_2014
CS all 3 8091 4808 5 967 11 442 3841 Results_06_Dec_17_cs
PN all 3 8091 4865 9 1206 12 565 3659 Results_06_Dec_17_pn
CS+PN all 6 8091 4610 4 547 11 268 4063 Results_06_Dec_17_all

Some example plots from the clustering results of Clust
Before I load the genes (VIT) placed in the clusters for each separate datasets

options(knitr.kable.NA = '')
setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/Before-correction/Results_28_Nov_17/")
filenames <- list.files(path = ".",pattern=("ts.tsv"), recursive=T)
all_clust <-  lapply(filenames, read.table, head=T, sep="\t", na.strings="")
names(all_clust) <- filenames
names(all_clust) <- gsub(pattern="Results_28_Nov_17_", replacement="", x=names(all_clust))
names(all_clust) <- gsub(pattern="/Clusters_Objects.tsv", replacement="", x=names(all_clust))
## rimuovo la seconda riga (quella con scritto Genes) da tutti i df ##
all_clust <- lapply(all_clust, function(x) x[2:nrow(x),])
dim(all_clust)
## NULL

Then I load all the data processed by Clust that the program used to plot the results (cleaned, filtered, centered, normalized)
To correct

Heatmaps of Pinot up-regulated genes around veraison

options(knitr.kable.NA = '')
setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/Before-correction/Results_28_Nov_17/")
cl.pn12 <- read.table("Results_28_Nov_17_PN-2012/Clusters_Objects.tsv", head=T, sep="\t", na.strings="")
cl.pn12 <- cl.pn12[2:nrow(cl.pn12),]
dim(cl.pn12)
## [1] 834  15
cl.pn12.up <- cl.pn12[,c(3,4)]
head(cl.pn12.up)
##       C2..21.genes.    C3..134.genes.
## 2 VIT_01s0011g00710 VIT_01s0011g01520
## 3 VIT_02s0025g03220 VIT_01s0011g01780
## 4 VIT_02s0025g04580 VIT_01s0011g04240
## 5 VIT_02s0025g04850 VIT_01s0011g04620
## 6 VIT_02s0025g04860 VIT_01s0011g05000
## 7 VIT_03s0091g00260 VIT_01s0011g05030
cl.pn13 <- read.table("Results_28_Nov_17_PN-2013/Clusters_Objects.tsv", head=T, sep="\t", na.strings="")
cl.pn13 <- cl.pn13[2:nrow(cl.pn13),]
dim(cl.pn13)
## [1] 763  12
cl.pn13.up <- cl.pn13[,c(6:9)]
head(cl.pn13.up)
##       C5..25.genes.     C6..54.genes.     C7..35.genes.     C8..73.genes.
## 2 VIT_01s0011g01070 VIT_01s0011g00620 VIT_00s0229g00160 VIT_01s0011g01300
## 3 VIT_01s0011g02310 VIT_01s0011g00710 VIT_01s0011g03910 VIT_01s0011g02730
## 4 VIT_02s0012g02670 VIT_01s0011g03220 VIT_01s0011g05180 VIT_01s0011g02990
## 5 VIT_02s0025g01010 VIT_01s0011g04030 VIT_01s0011g05240 VIT_01s0011g03610
## 6 VIT_02s0241g00090 VIT_01s0011g04250 VIT_01s0011g05250 VIT_01s0011g04280
## 7 VIT_03s0063g00580 VIT_01s0011g04640 VIT_01s0137g00240 VIT_01s0011g04790
cl.pn14 <- read.table("Results_28_Nov_17_PN-2014/Clusters_Objects.tsv", head=T, sep="\t", na.strings="")
cl.pn14 <- cl.pn14[2:nrow(cl.pn14),]
dim(cl.pn14)
## [1] 914  13
cl.pn14.up <- cl.pn14[,c(8:11)]
head(cl.pn14.up)
##       C7..52.genes.     C8..18.genes.     C9..23.genes.    C10..61.genes.
## 2 VIT_01s0011g01140 VIT_01s0011g03920 VIT_01s0011g01520 VIT_00s0226g00090
## 3 VIT_01s0011g01510 VIT_01s0011g05590 VIT_01s0011g03480 VIT_00s0282g00030
## 4 VIT_01s0011g03910 VIT_02s0025g01430 VIT_01s0011g04490 VIT_01s0011g04350
## 5 VIT_01s0011g04240 VIT_02s0025g02360 VIT_02s0012g00630 VIT_01s0011g05830
## 6 VIT_01s0011g06250 VIT_03s0063g00820 VIT_02s0025g02140 VIT_01s0137g00300
## 7 VIT_01s0137g00180 VIT_05s0102g01130 VIT_02s0025g02420 VIT_02s0012g00480

Which and how many up-regulated genes are in common between all the 3 years in Pinot clusters as obtained by Clust on each separate datasets?

options(knitr.kable.NA = '')
## I already know that these are 13 =) 
thirteen <- sort(intersect(intersect(unique(as.vector(as.matrix(cl.pn12.up[,c(1:2)]))), unique(as.vector(as.matrix(cl.pn13.up[,c(1:4)])))), unique(as.vector(as.matrix(cl.pn14.up[,c(1:4)])))))
kable(thirteen, align="l")
VIT_02s0025g01610
VIT_03s0063g01870
VIT_03s0091g00240
VIT_12s0059g00670
VIT_14s0006g01660
VIT_16s0098g00790
VIT_17s0000g05550
VIT_18s0001g09500
VIT_18s0001g09900
VIT_18s0001g11260
VIT_18s0001g12480
VIT_18s0001g12510
VIT_18s0001g13060
## Who are these genes?
kable(new_afi_definitive[new_afi_definitive$gene %in% thirteen , c(1,2,3,90:97)], align="l", row.names=F)
gene chr start chr_position_12X annotation_V1 DEG_common switch_common NAC DEG_atlas switch_atlas marker_transition
VIT_02s0025g01610 2 1372499 chr02_01560083_01563233 Phosphatidylinositol 4-kinase type-II marker_transition
VIT_03s0063g01870 3 4236634 chr03_05200808_05203881 Cold acclimation protein WCOR413 protein beta form marker_transition
VIT_03s0091g00240 3 6521904 chr03_06521904_06537905 Haloacid dehalogenase hydrolase
VIT_12s0059g00670 12 5504434 chr12_05504434_05508516 NADP-dependent oxidoreductase DEG_atlas
VIT_14s0006g01660 14 18041242 chr14_18041242_18104242 Pyridoxal kinase
VIT_16s0098g00790 16 22650462 chr16_21130941_21133087 Copper-binding family protein DEG_common marker_transition
VIT_17s0000g05550 17 6183508 chr17_06051880_06063502 Proton-dependent oligopeptide transport (POT) family protein DEG_common DEG_atlas
VIT_18s0001g09500 18 7942455 chr18_07942455_07953002 CYP81B2v1
VIT_18s0001g09900 18 8275291 chr18_08275291_08275974 No hit
VIT_18s0001g11260 18 9571971 chr18_09571971_09578214 Nucleobase-ascorbate transporter 3 (NAT3)
VIT_18s0001g12480 18 10670045 chr18_10670045_10675852 Metal transporter CNNM4 (Cyclin-M4)
VIT_18s0001g12510 18 10686816 chr18_10686816_10697439 5’ nucleotidase marker_transition
VIT_18s0001g13060 18 11153980 chr18_11153980_11161250 C3H2C3 ring-finger protein

Now I retrieve the up-regulated genes as obtained by Clust on the 3 years datasets integrated analysis

setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/After-correction/Results_06_Dec_17/Results_06_Dec_17_pn/")
pn.up.together <- read.table("Clusters_Objects.tsv", head=T, sep="\t", na.strings="")
pn.up.together <- pn.up.together[2:nrow(pn.up.together),]
dim(pn.up.together)
## [1] 565   9
head(pn.up.together)
##      C0..451.genes.     C1..29.genes.     C2..23.genes.     C3..45.genes.
## 2 VIT_00s0226g00040 VIT_01s0011g02040 VIT_00s0229g00030 VIT_00s1338g00020
## 3 VIT_00s0229g00060 VIT_01s0011g05730 VIT_01s0011g02810 VIT_01s0011g01290
## 4 VIT_00s0555g00050 VIT_02s0012g01730 VIT_02s0025g03560 VIT_01s0011g02310
## 5 VIT_01s0011g00580 VIT_02s0025g02040 VIT_02s0033g01310 VIT_01s0011g04050
## 6 VIT_01s0011g00650 VIT_02s0025g02760 VIT_02s0087g00130 VIT_01s0011g05850
## 7 VIT_01s0011g00700 VIT_03s0091g01230 VIT_03s0038g02340 VIT_01s0011g06030
##       C4..22.genes.     C5..17.genes.     C6..42.genes.    C7..565.genes.
## 2 VIT_01s0011g02320 VIT_01s0011g00710 VIT_00s0229g00140 VIT_00s0229g00100
## 3 VIT_01s0137g00670 VIT_01s0011g01520 VIT_00s0229g00160 VIT_00s0229g00180
## 4 VIT_02s0025g04250 VIT_01s0011g04250 VIT_01s0011g00550 VIT_00s0965g00010
## 5 VIT_02s0025g04260 VIT_01s0011g04640 VIT_01s0011g00850 VIT_01s0011g00590
## 6 VIT_03s0063g00820 VIT_01s0011g06250 VIT_01s0011g01640 VIT_01s0011g00610
## 7 VIT_03s0088g00710 VIT_02s0025g00470 VIT_01s0011g03610 VIT_01s0011g00660
##       C8..12.genes.
## 2 VIT_02s0025g03100
## 3 VIT_05s0094g01280
## 4 VIT_07s0104g00130
## 5 VIT_11s0016g05100
## 6 VIT_12s0059g02270
## 7 VIT_13s0084g00810

Among these clusters the profiles we are interested in are the ones from clusters C3 and C4

options(knitr.kable.NA = '')
## I already know that these are 62 =) 
sixtytwo <- sort(unique(as.vector(as.matrix(pn.up.together[,c(4,6)]))))
kable(sixtytwo)
VIT_00s1338g00020
VIT_01s0011g00710
VIT_01s0011g01290
VIT_01s0011g01520
VIT_01s0011g02310
VIT_01s0011g04050
VIT_01s0011g04250
VIT_01s0011g04640
VIT_01s0011g05850
VIT_01s0011g06030
VIT_01s0011g06250
VIT_02s0012g00760
VIT_02s0012g01100
VIT_02s0012g01630
VIT_02s0025g00470
VIT_02s0025g01400
VIT_02s0025g03310
VIT_02s0025g03510
VIT_02s0025g04340
VIT_02s0033g01330
VIT_02s0109g00230
VIT_03s0038g02190
VIT_03s0063g00360
VIT_03s0063g01870
VIT_03s0063g02410
VIT_03s0063g02510
VIT_05s0029g00140
VIT_05s0094g00350
VIT_05s0094g01110
VIT_05s0094g01210
VIT_05s0102g00440
VIT_05s0102g01190
VIT_05s0124g00270
VIT_06s0009g02640
VIT_06s0009g03190
VIT_09s0070g00490
VIT_11s0016g03200
VIT_11s0016g03720
VIT_11s0118g00230
VIT_11s0118g00240
VIT_12s0059g00810
VIT_12s0059g01080
VIT_13s0084g00050
VIT_14s0068g00640
VIT_16s0098g00790
VIT_17s0000g04890
VIT_17s0000g05580
VIT_17s0000g05960
VIT_17s0000g06300
VIT_17s0000g07490
VIT_17s0000g07610
VIT_17s0000g07920
VIT_18s0001g00890
VIT_18s0001g02250
VIT_18s0001g02970
VIT_18s0001g03950
VIT_18s0001g08490
VIT_18s0001g11930
VIT_18s0001g13950
VIT_18s0041g00740
VIT_18s0075g00510
VIT_18s0122g00410
## Who are these genes?
kable(new_afi_definitive[new_afi_definitive$gene %in% sixtytwo , c(1,2,3,90:97)], align="l")
gene chr start chr_position_12X annotation_V1 DEG_common switch_common NAC DEG_atlas switch_atlas marker_transition
VIT_01s0011g00710 1 616107 chr01_00616107_00618157 Zinc finger (C2H2 type) family
VIT_01s0011g01290 1 1098416 chr01_01098416_01102667 N-carbamyl-L-amino acid amidohydrolase
VIT_01s0011g01520 1 1336774 chr01_01336774_01340483 Peroxisomal 2,4-dienoyl-CoA reductase
VIT_01s0011g02310 1 2088704 chr01_02088704_02090176 Unknown protein marker_transition
VIT_01s0011g04050 1 3698303 chr01_03698303_03708992 Nucleotidyltransferase
VIT_01s0011g04250 1 3854482 chr01_03854482_03857576 Zinc finger (B-box type)
VIT_01s0011g04640 1 4174779 chr01_04174779_04176173 Auxin Efflux Carrier
VIT_01s0011g05850 1 5609439 chr01_05609439_05614892 SEC14 cytosolic factor
VIT_01s0011g06030 1 5786224 chr01_05786224_05797572 Myrosinase precursor
VIT_01s0011g06250 1 6043632 chr01_06043632_06047154 Xyloglucan endotransglycosylase
VIT_02s0025g00470 2 369467 chr02_00557051_00570448 EMB3001 (embryo defective 3001) marker_transition
VIT_02s0025g01400 2 1153133 chr02_01340717_01348463 Leucine-rich repeat family protein stress
VIT_02s0025g03310 2 2633914 chr02_02821498_02823906 Arsenite transport protein (ArsB) DEG_common
VIT_02s0025g03510 2 2815103 chr02_03002687_03014242 Bromo-adjacenty (BAH) domain-containing protein
VIT_02s0025g04340 2 3637347 chr02_03824931_03825899 Osmotin DEG_common switch_common DEG_atlas switch_atlas
VIT_00s1338g00020 2 5343106 chrUn_38606407_38609245 Protein transport protein Sec61 subunit alpha
VIT_02s0012g00760 2 6778127 chr02_06666128_06670079 Haloacid dehalogenase hydrolase marker_transition
VIT_02s0012g01100 2 7175167 chr02_07063168_07078721 Protein phosphatase 2C POLTERGEIST-like 1
VIT_02s0012g01630 2 7998696 chr02_07886697_07889877 Transmembrane protein 41B marker_transition
VIT_02s0109g00230 2 12786746 chr02_12674747_12696922 Early-responsive to dehydration protein / ERD protein marker_transition
VIT_02s0033g01330 2 16897866 chr02_16785867_16789390 Acyl-CoA binding protein DEG_common switch_common marker_transition
VIT_03s0038g02190 3 1502133 chr03_01502133_01506299 Nodulin
VIT_03s0063g02510 3 3696360 chr03_05738865_05744155 Protein kinase
VIT_03s0063g02410 3 3783389 chr03_05641144_05657126 Switching protein 3D ATSWI3D
VIT_03s0063g01870 3 4236634 chr03_05200808_05203881 Cold acclimation protein WCOR413 protein beta form marker_transition
VIT_03s0063g00360 3 5511388 chr03_03918238_03929127 Glycoprotein 3-alpha-L-fucosyltransferase marker_transition
VIT_05s0029g00140 5 14717848 chr05_14296111_14297784 ERF/AP2 Gene Family (VvERF005),Dehydration Responsive Element-Binding Transcription Factor (VvDREB05) DEG_common DEG_atlas
VIT_05s0124g00270 5 21236964 chr05_20815227_20843551 Unknown protein
VIT_05s0102g00440 5 22815958 chr05_22394221_22398999 FAR1-related sequence 5
VIT_05s0102g01190 5 23751980 chr05_23330243_23338137 Calreticulin-3 precursor marker_transition
VIT_05s0094g00350 5 24291499 chr05_23662399_23675612 Chitinase class IV DEG_common
VIT_05s0094g01110 5 25073598 chr05_24444498_24453092 Acetolactate synthase small subunit
VIT_05s0094g01210 5 25136361 chr05_24507261_24509135 Amine oxidase
VIT_06s0009g02640 6 16547312 chr06_15422302_15431585 Hydroxymethylglutaryl-CoA lyase
VIT_06s0009g03190 6 17503185 chr06_16378175_16390845 Unknown protein
VIT_09s0070g00490 9 13760643 chr09_13760643_13784740 Protein phosphatase 2C
VIT_11s0016g03200 11 2568013 chr11_02568013_02573551 Gibberellin oxidase marker_transition
VIT_11s0016g03720 11 3049799 chr11_03049799_03055105 Aspartate aminotransferase, cytoplasmic (Transaminase A) marker_transition
VIT_11s0118g00230 11 5860792 chr11_05843896_05869373 ATSYTE/NTMC2T2.1/NTMC2TYPE2.1/SYTE
VIT_11s0118g00240 11 5886913 chr11_05870017_05909727 Purple acid phosphatase 26 marker_transition
VIT_12s0059g00810 12 5609780 chr12_05609780_05612228 B-cell receptor-associated protein 31
VIT_12s0059g01080 12 5990364 chr12_05990364_05993590 Acid phosphatase/vanadium-dependent haloperoxidase marker_transition
VIT_13s0084g00050 13 21846264 chr13_18556390_18561669 Cysteine proteinase inhibitor marker_transition
VIT_14s0068g00640 14 24438706 chr14_24438706_24450994 Acetyl-CoA synthetase marker_transition
VIT_16s0098g00790 16 22650462 chr16_21130941_21133087 Copper-binding family protein DEG_common marker_transition
VIT_17s0000g04890 17 5409996 chr17_05278368_05292271 D-aminoacyl-tRNA deacylase GEKO1
VIT_17s0000g05580 17 6213229 chr17_06081601_06089504 Isopiperitenol dehydrogenase DEG_common DEG_atlas marker_transition
VIT_17s0000g05960 17 6630398 chr17_06498770_06499583 Pectinesterase family
VIT_17s0000g06300 17 6963086 chr17_06831458_06832686 No hit marker_transition
VIT_17s0000g07490 17 8555284 chr17_08423656_08438725 Unknown protein
VIT_17s0000g07610 17 8721441 chr17_08589813_08592495 B12D protein
VIT_17s0000g07920 17 9030527 chr17_08898899_08901368 Hypoxia-responsive
VIT_18s0122g00410 18 368562 chr18_00368562_00375676 PPI1 (proton pump interactor 1
VIT_18s0001g00890 18 1619182 chr18_01619182_01620812 Pentatricopeptide (PPR) repeat-containing
VIT_18s0001g02970 18 3057427 chr18_03057427_03063862 Glucose-6-phosphate/phosphate translocator related
VIT_18s0001g03950 18 3587990 chr18_03587990_03596239 Pm27 protein marker_transition
VIT_18s0001g08490 18 6935594 chr18_06935594_06939798 Early-responsive to dehydration marker_transition
VIT_18s0001g11930 18 10174236 chr18_10174236_10175399 Thaumatin SCUTL2 DEG_common switch_common DEG_atlas marker_transition
VIT_18s0001g13950 18 11940712 chr18_11940712_11949407 RNA polymerase Rpa43 subunit
VIT_18s0001g02250 18 20579102 chr18_random_02618009_02632515 Ras-related protein Rab-7A
VIT_18s0075g00510 18 24952214 chr18_22007280_22010150 Calpain alkylated DNA repair protein alkB homolog 6
VIT_18s0041g00740 18 28678219 chr18_25261198_25262924 UDP-glucose: anthocyanidin 5,3-O-glucosyltransferase DEG_common DEG_atlas marker_transition

Now I try to plot these genes as a heatmap before with raw values and then with the values normalized and centered by Clust
Example with Pinot 2012. I use the function I wrote to clean RNAseq datasets to clean PN12 raw dataset

Raw FPKM

options(knitr.kable.NA = '')
source("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/clean_RNAseq_datasets_windows.r")
dim(pn.12)
## [1] 29971    30
pn.12.cleaned <- clean_RNAseq_datasets_windows("PN12_FPKM_table.txt")
dim(pn.12.cleaned)
## [1] 19437    10
pn.12.cleaned$ID <- rownames(pn.12.cleaned)
pn.12.cleaned.log <- log2(pn.12.cleaned[,1:10])
pn.12.cleaned.log$ID <- rownames(pn.12.cleaned.log)
## raw raw values
kable(head(pn.12.cleaned[rownames(pn.12.cleaned) %in% thirteen , -11 ]),align="l")
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9
VIT_03s0091g00240 7.385783 10.469067 13.379533 21.721367 22.487500 18.16630 14.483003 8.084137 7.617170 6.1112433
VIT_12s0059g00670 1.930513 1.631103 1.411227 3.090993 4.838617 4.30138 3.271703 2.320103 1.556464 0.4801953
VIT_18s0001g09900 9.452897 10.266453 7.660330 24.494367 29.082300 21.45623 14.472933 10.855890 6.323533 2.8722033
VIT_14s0006g01660 16.221000 17.173633 17.008500 25.092033 22.168833 19.33003 15.489000 17.890900 12.078300 11.0676567
VIT_18s0001g11260 85.197933 113.945667 109.638333 213.294667 255.716333 223.99800 195.114333 149.810000 136.956333 101.9110333
VIT_17s0000g05550 8.238217 11.011500 13.010067 21.851267 22.387833 20.84993 16.842800 11.429910 9.461167 6.4447100
ggplot(melt(pn.12.cleaned[rownames(pn.12.cleaned) %in% thirteen , ], id.vars="ID"), aes(x=variable, y=value, group=ID)) + geom_line() + theme_bw() + ggtitle("pn12 thirteen raw FPKM")

heatmap.2(as.matrix(pn.12.cleaned[rownames(pn.12.cleaned) %in% thirteen , c(1:10) ]), Colv=F, trace="none",margins = c(8, 16))

kable(head(pn.12.cleaned[rownames(pn.12.cleaned) %in% sixtytwo, ]),align="l")
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 ID
VIT_01s0011g00710 0.691529 1.68638 1.105790 2.980277 3.262777 2.466297 2.390553 2.003363 2.071133 1.561437 VIT_01s0011g00710
VIT_05s0102g00440 4.732187 6.34187 6.359550 6.177623 6.803440 6.351583 6.148123 6.223867 6.284473 6.447620 VIT_05s0102g00440
VIT_00s1338g00020 51.160933 70.88963 74.754633 111.880667 106.706000 113.289000 101.811033 95.869667 117.425000 99.276667 VIT_00s1338g00020
VIT_12s0059g01080 3.852520 4.36186 3.812033 6.606327 9.338753 10.905913 11.511100 15.190033 12.158700 11.441933 VIT_12s0059g01080
VIT_01s0011g06030 23.900000 24.05233 24.394900 27.139933 29.504100 31.305567 27.922933 27.137500 35.179167 34.854067 VIT_01s0011g06030
VIT_11s0016g03200 15.827533 25.68413 18.611067 50.675533 71.516667 74.403067 66.719667 72.140933 85.163067 92.235500 VIT_11s0016g03200
ggplot(melt(pn.12.cleaned[rownames(pn.12.cleaned) %in% sixtytwo, ], id.vars="ID"), aes(x=variable, y=value, group=ID)) + geom_line() + theme_bw() + ggtitle("pn12 sixtytwo raw FPKM")

heatmap.2(as.matrix(pn.12.cleaned[rownames(pn.12.cleaned) %in% sixtytwo , c(1:10) ]), Colv=F, trace="none",margins = c(8, 16), cexRow=0.4)

## log2 version
kable(head(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% thirteen , ]),align="l")
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 ID
VIT_03s0091g00240 2.8847509 3.3880609 3.7419559 4.441043 4.491051 4.183193 3.856289 3.015094 2.9292551 2.611466 VIT_03s0091g00240
VIT_12s0059g00670 0.9489845 0.7058482 0.4969497 1.628070 2.274595 2.104800 1.710042 1.214189 0.6382719 -1.058307 VIT_12s0059g00670
VIT_18s0001g09900 3.2407565 3.3598660 2.9374065 4.614378 4.862069 4.423325 3.855285 3.440406 2.6607309 1.522158 VIT_18s0001g09900
VIT_14s0006g01660 4.0197909 4.1021234 4.0881840 4.649158 4.470461 4.272772 3.953172 4.161154 3.5943455 3.468278 VIT_14s0006g01660
VIT_18s0001g11260 6.4127465 6.8322022 6.7766085 7.736704 7.998400 7.807342 7.608176 7.226990 7.0975722 6.671166 VIT_18s0001g11260
VIT_17s0000g05550 3.0423321 3.4609391 3.7015564 4.449645 4.484643 4.381971 4.074060 3.514742 3.2420181 2.688115 VIT_17s0000g05550
ggplot(melt(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% thirteen , ], id.vars="ID"), aes(x=variable, y=value, group=ID)) + geom_line() + theme_bw() + ggtitle("pn12 thirteen log2 FPKM")

heatmap.2(as.matrix(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% thirteen , c(1:10) ]), Colv=F, trace="none",margins = c(8, 16))

kable(head(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% sixtytwo, ]),align="l")
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 ID
VIT_01s0011g00710 -0.5321383 0.7539297 0.1450774 1.575446 1.706100 1.302346 1.257345 1.002424 1.050420 0.6428741 VIT_01s0011g00710
VIT_05s0102g00440 2.2425070 2.6649083 2.6689247 2.627052 2.766264 2.667116 2.620146 2.637811 2.651792 2.6887667 VIT_05s0102g00440
VIT_00s1338g00020 5.6769707 6.1475028 6.2240911 6.805817 6.737497 6.823864 6.669750 6.583003 6.875596 6.6333828 VIT_00s1338g00020
VIT_12s0059g01080 1.9458024 2.1249435 1.9305607 2.723848 3.223230 3.447039 3.524954 3.925053 3.603917 3.5162589 VIT_12s0059g01080
VIT_01s0011g06030 4.5789387 4.5881050 4.6085077 4.762345 4.882843 4.968347 4.803379 4.762216 5.136649 5.1232551 VIT_01s0011g06030
VIT_11s0016g03200 3.9843645 4.6828055 4.2180888 5.663218 6.160208 6.217290 6.060040 6.172746 6.412156 6.5272502 VIT_11s0016g03200
ggplot(melt(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% sixtytwo, ], id.vars="ID"), aes(x=variable, y=value, group=ID)) + geom_line() + theme_bw() + ggtitle("pn12 sixtytwo log2 FPKM")

is.na(pn.12.cleaned.log) <- sapply(pn.12.cleaned.log, is.infinite)
heatmap.2(as.matrix(pn.12.cleaned.log[rownames(pn.12.cleaned.log) %in% sixtytwo, c(1:10) ]), Colv=F, trace="none",margins = c(8, 16), cexRow=0.4)

Data transformed by Clust

options(knitr.kable.NA = '')
## load data processed by Clust from single dataframe analysis
pn.12.processed.single <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/Before-correction/Results_28_Nov_17/Results_28_Nov_17_PN-2012/Processed_Data/pn_2012.txt_processed.tsv", head=T, sep="\t", na.strings="")
dim(pn.12.processed.single)
## [1] 5223   11
kable(head(pn.12.processed.single[pn.12.processed.single$Genes %in% thirteen, ]),align="l")
Genes t0 t1 t2 t3 t4 t5 t6 t7 t8 t9
816 VIT_02s0025g01610 -1.2685294 -1.2146402 -1.2621850 0.2547503 1.939204 1.1392424 0.4441945 -0.0301788 -0.0217308 0.0198727
1490 VIT_03s0063g01870 -1.4768386 -0.6324725 -1.4149163 0.8093506 1.139652 1.2577611 1.1086311 0.0990995 -0.0944560 -0.7958107
1625 VIT_03s0091g00240 -1.3130570 -0.7394178 -0.4419257 1.1067932 1.674054 0.9832702 0.9005157 -0.4057954 -0.8215623 -0.9428748
2828 VIT_12s0059g00670 -0.6094980 -0.8946531 -1.0543887 0.1028155 1.464734 1.4937000 1.0919396 0.3773177 -0.8275707 -1.1443961
3074 VIT_14s0006g01660 -0.6352968 -0.8455768 -1.0250977 1.4035965 1.328436 0.9842226 0.4522699 0.5975094 -1.2060074 -1.0540558
3676 VIT_16s0098g00790 -1.2518436 -1.1961095 -1.2897724 0.1529066 1.608353 1.4302411 0.7463269 0.1646176 -0.3094673 -0.0552529
ggplot(melt(pn.12.processed.single[pn.12.processed.single$Genes %in% thirteen, ], id.vars="Genes"), aes(x=variable, y=value, group=Genes)) + geom_line() + theme_bw() + ggtitle("pn12 thirteen by clust single datasets")

rownames(pn.12.processed.single) <- pn.12.processed.single$Genes
heatmap.2(as.matrix(pn.12.processed.single[rownames(pn.12.processed.single) %in% thirteen, -1]), Colv=F, trace="none",margins = c(8, 16))

## load data processed by Clust from multiple dataframe analysis
pn.12.processed.together <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/After-correction/Results_06_Dec_17/Results_06_Dec_17_pn/Processed_Data/pn_2012.txt_processed.tsv", head=T, sep="\t", na.strings="")
kable(head(pn.12.processed.together[pn.12.processed.together$Genes %in% sixtytwo, ]),align="l")
Genes t.2 t.1 t0 t1 t2 t3 t4 t5 t6
40 VIT_00s1338g00020 -1.7561878 -1.699453 -0.0412998 0.0947518 0.8082985 0.8215968 0.3620912 1.2357449 0.1744579
63 VIT_01s0011g00710 -1.3770076 -1.975395 -0.0003789 1.1926859 0.7237330 0.9062470 0.5469846 0.2511392 -0.2680081
102 VIT_01s0011g01290 -1.2561277 -1.731250 -0.8276336 0.0723928 0.8912415 1.4644438 0.3075506 0.7832809 0.2961015
120 VIT_01s0011g01520 -0.8440816 -2.189173 0.6724090 0.8727039 1.2559142 0.5960390 0.2692533 -0.1218086 -0.5112558
185 VIT_01s0011g02310 -1.5573986 -1.825716 -0.3402707 0.1635896 0.7572557 1.1911618 0.8209424 0.6402520 0.1501832
293 VIT_01s0011g04050 -1.0470033 -1.105261 -1.3640637 -0.2150218 0.5371196 0.3529074 -0.0424109 1.1236872 1.7600468
ggplot(melt(pn.12.processed.together[pn.12.processed.together$Genes %in% sixtytwo, ], id.vars="Genes"), aes(x=variable, y=value, group=Genes)) + geom_line() + theme_bw() + ggtitle("pn12 sixtytwo by clust all 3 years")

rownames(pn.12.processed.together) <- pn.12.processed.together$Genes
heatmap.2(as.matrix(pn.12.processed.together[rownames(pn.12.processed.together) %in% sixtytwo, -1 ]), Colv=F, trace="none",margins = c(8, 16), cexRow=0.4)

We decided to focus our analysis on the clusters produced by the analysis of all the 3 Pinot datasets together, that is the 62 up and 577 down regulated genes obtained by running Clust with all the 3 ds together
Now I retrieve the down-regulated genes as obtained by Clust on the 3 years datasets integrated analysis

## this file is the same as pn.up.together but I load it again anyway
setwd("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/After-correction/Results_06_Dec_17/Results_06_Dec_17_pn/")
pn.down.together <- read.table("Clusters_Objects.tsv", head=T, sep="\t", na.strings="")
pn.down.together <- pn.down.together[2:nrow(pn.down.together),]
dim(pn.down.together)
## [1] 565   9

Among these clusters the profiles we are interested in are the ones from clusters C7 and C8

options(knitr.kable.NA = '')
## I already know that these are 577 =) 
five77 <- sort(unique(as.vector(as.matrix(pn.down.together[,c(8,9)]))))
kable(head(five77),align="l")
VIT_00s0229g00100
VIT_00s0229g00180
VIT_00s0965g00010
VIT_01s0011g00590
VIT_01s0011g00610
VIT_01s0011g00660
## Who are these genes?
kable(head(new_afi_definitive[new_afi_definitive$gene %in% five77 , c(1,2,3,90:97)]), align="l")
gene chr start chr_position_12X annotation_V1 DEG_common switch_common NAC DEG_atlas switch_atlas marker_transition
VIT_01s0011g00590 1 543821 chr01_00543821_00549831 Acclimation of photosynthesis to environment DEG_common DEG_atlas
VIT_01s0011g00610 1 558179 chr01_00558179_00562815 Unknown protein DEG_common
VIT_01s0011g00660 1 590563 chr01_00590563_00592859 Pentatricopeptide (PPR) repeat
VIT_01s0011g00730 1 649069 chr01_00649069_00654392 Alpha-1,4-glucan phosphorylase type H DEG_atlas
VIT_01s0011g00930 1 793778 chr01_00793778_00796517 FK506-binding protein genes family (VvFKBP13) DEG_common DEG_atlas marker_transition
VIT_01s0011g01320 1 1121407 chr01_01121407_01125878 Cholinephosphate cytidylyltransferase

Plot of data processed by Clust from multiple dataframe analysis

pn.12.processed.together <- read.table("X:/BIOINFORMATICS/Dropbox/PETER/METAQTL/QTLDB/analisi_mqtl/analisi_mqtl_070317/dataset_trascrittomica/nuovi_dataset/cluster_analysis-heatmap/clust/Results/After-correction/Results_06_Dec_17/Results_06_Dec_17_pn/Processed_Data/pn_2012.txt_processed.tsv", head=T, sep="\t", na.strings="")
ggplot(melt(pn.12.processed.together[pn.12.processed.together$Genes %in% five77, ], id.vars="Genes"), aes(x=variable, y=value, group=Genes)) + geom_line() + theme_bw() + ggtitle("pn12 five77 by clust all 3 years")

rownames(pn.12.processed.together) <- pn.12.processed.together$Genes
heatmap.2(as.matrix(pn.12.processed.together[rownames(pn.12.processed.together) %in% five77, -1 ]), Colv=F, trace="none",margins = c(8, 16), cexRow=0.3)

Gene Ontology of genes in clusters

Gene Ontology from BiomaRt

options(knitr.kable.NA = '')
plant<-useMart("plants_mart",dataset="vvinifera_eg_gene", host="plants.ensembl.org")
attributes <- listAttributes(plant)
go_attributes <- attributes[27:33,1]
kable(go_attributes, align="l")
go_id
name_1006
definition_1006
go_linkage_type
namespace_1003
goslim_goa_accession
goslim_goa_description
sixtytwo.go <- getBM(attributes=c('ensembl_gene_id','chromosome_name', go_attributes),filters ='ensembl_gene_id', values = sixtytwo, mart = plant)
## No encoding supplied: defaulting to UTF-8.
DT::datatable(sixtytwo.go, rownames = F, filter = "top")
ggplot(sixtytwo.go, aes(fct_infreq(go_id))) + geom_bar() +  theme(axis.text.x=element_text(angle=90,hjust=1))

#ggplot(c1, aes(fct_infreq(goslim_goa_accession))) + geom_bar()
#c1[grepl("GO:0016020",c1$go_id),]